#!/bin/sh
# Nagios plugin to monitor Puppet agent state
#
# Copyright (c) 2011 Alexander Swen <a@swen.nu>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# Example configuration
#
# Typically this check is placed on a client and run via NRPE.
# So add this to nrpe.cfg:
#  command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet
# or if you want to specify options (rather than have the script calculate key values and facts) then something like
#  command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w 3600 -c 7200 -s /var/lib/puppet/state/last_run_summary.yaml -d 0
# This should warn when the agent hasn't run for an hour and go critical after two hours.
#  If you have dont_blame_nrpe=1 set you can choose to pass the values as arguments:
#  command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w $ARG1$ -c $ARG2$ -s $ARG3$ -d $ARG4$
#
# define service {
#  use generic-service
#  service_description Puppet agent
#  check_command check_nrpe!check_puppet_agent
# or
#  check_command check_nrpe!check_puppet_agent!3600!7200
#}
#
# Sudo required.
# The user running this script must be allowed using sudo to run puppet config print, e.g. in /etc/sudoers include the 3 lines
# User_Alias NAGIOS=nagios
# Cmnd_Alias PUPPETCHECK=/usr/bin/puppet config print all, \ # puppet 2
#                        /usr/bin/puppet config print, \ # puppet 3
#                        /usr/bin/puppet config print --section agent # other puppet version
# NAGIOS     ALL=NOPASSWD:PUPPETCHECK
#
# CHANGELOG:
# 20120126	A.Swen	    created.
# 20120214  trey85stang Modified, added getopts, usage, defaults.
# 20120220  A.Swen      lastrunfile can be overridden.
# 20130717  A.Swen      Moved finding lastrunfile to after getopts and made it conditional to param -s.
#                       Added option to tell script if puppet agent is started from cron or as a daemon (-d).
#                       Switched to use awk to filter values from lastrunfile and set them as params.
#                       Updated some comments.
#                       Removed bug in search for process (that would previously always find something because grep finds its own process line).
#                       "puppet agent --configprint lastrunfile" has to be run as root. As normal user it yields ~/.puppet/var/state.
#                       Based on feedback Михайло Масик updated:
#                       - Puppet --configprint => puppet agent --configprint (version 3 has new way of printing config).
#                       - Added new pattern to search for process.
#                       - Added test kill -0 to see if process is still there.
# 20130725  A.Swen      Based on feedback Михайло Масик updated a test (removed ! from test).
# 20130725  A.Swen      Added sudo to puppet config print pidfile.
# 20131209  Mark Ruys   Issue warning when last_run_report.yaml contain errors.
# 20141015  A.Swen      Add show disabled status.
# 20141127  KissT       Remove requirement to have sudo custom rule.
# 20150917  A.Swen      Based on an idea of Daniel Lawrence check for major version to decide how to print config.
#                       Based on idea of D.Stirling switched to sh.
#                       Findout puppet executable location using which.
#                       Based on an idea of D.Stirling updated daemon check.
#                       Based on an idea of D.Stirling made BSD compatible.
#                       Based on an idea of BTriller fix the getopts command to parse the agent_disabled_lockfile option.
# 20151201  Akomakom    Add perf data option.
#                       More reliable yaml parsing.
#                       If $HOME not set: set it.
#                       Fix PS command for Suse.
# 20151218  K.A. Gillow Calculate warn/crit based on runinterval and splay setting rather than use fixed settings.
#                       Check system has been up longer than crit/warn time otherwise don't yet trigger normally relevant fault levels.
#                       We never generally want puppet disabled so change to warning.
# 20151229  A.Swen      Fix bug in PERF_DATA (replace compset by set).
#                       Prettify $PERF_DATA output.
# 20160201  S. Sams     Changes to PERF_DATA output format to increase compatibility with Nagios Plugin guidelines.
#                       Add compatibility with Puppet 4.x
# 20160315  J. Yaworski Add -v, allowing to pass a version to compare
# 20160815  L. Buriola  Add -E to show first error on output
# 20170426  benwtr      Detect failure to retrieve catalog from server as a warning.
# 20180324  deric       Discard puppet config error (logging) output

# FUNCTIONS
currentcpuload=`uptime | awk -F"load average:" '{print $NF}'|awk -F" " '{print $1}'|awk -F"." '{print $1}'`
if [ $currentcpuload -ge 100 ];then
    echo "cpuload is $currentcpuload"
    exit $UNKNOWN
fi

result () {
  case $1 in
    0) echo "OK: Puppet agent $version running catalog version $config, and executed at $last_run_human for last time. $PERF_DATA";rc=0 ;;
    1) echo "UNKNOWN: last_run_summary.yaml not found, not readable or incomplete";rc=3 ;;
    2) echo "WARNING: Last run was $time_since_last seconds ago. Warn is $WARN. $PERF_DATA";rc=1 ;;
    3) echo "WARNING: Last run was $time_since_last seconds ago. Crit is $CRIT. $PERF_DATA";rc=1 ;;
    4) echo "CRITICAL: Puppet daemon not running or something wrong with process";rc=2 ;;
    5) echo "UNKNOWN: no WARN or CRIT parameters were sent to this check";rc=3 ;;
    6) echo "WARNING: Last run had 1 or more errors. Check the logs. $FIRST_ERROR $PERF_DATA";rc=1 ;;
    7) echo "DISABLED: Reason: $(sed -e 's/{"disabled_message":"//' -e 's/"}//' $agent_disabled_lockfile). $PERF_DATA";rc=1 ;;
    8) echo "UNKNOWN: No Puppet executable found";rc=3 ;;
    9) echo "UNKNOWN: Internal error: $2"; rc=3 ;;
   10) echo "OK (PROBABLY): Puppet agent last successful run $last_run_human (runinterval $runinterval, splay $splay, splaylimit $splay limit) but system has not been up long enough to guarantee a fresh puppet run should have occurred";rc=0 ;;
   11) echo "INFO: Puppet agent is version $version, but should be $wanted_version. $PERF_DATA";rc=0 ;;
   12) echo "UNKNOWN: last_run_report.yaml not found, not readable or incomplete";rc=3 ;;
   13) echo "WARNING: Failed to retrieve catalog on last run.";rc=1 ;;
   14) echo "UNKNOWN: No sudo executable found";rc=3 ;;
  esac
  exit $rc
}

usage () {
  echo ""
  echo "USAGE: "
  echo "  $0 [-c 7200] [-w 3600] [-d 0] [-l agent_disabled_lockfile] [-s lastrunfile] [-r lastrunreport] [-v wanted_version] [-PEh]"
  echo "    -c Critical threshold (default 7200 seconds)"
  echo "    -w Warning threshold (default 3600 seconds)"
  echo "    -d 0|1: puppet agent should be a daemon(1) or not (0).(default 1)"
  echo "    -h Show this help."
  echo "    -l Agent_disabled_lockfile (default: /var/lib/puppet/state/agent_disabled.lock)"
  echo "    -s Lastrunfile (default: /var/lib/puppet/state/last_run_summary.yaml)"
  echo "    -r Lastrunreport (default: /var/lib/puppet/state/last_run_report.yaml)"
  echo "    -P Enable perf_data in the output"
  echo "    -E Show first error in the output"
  echo "    -v The version of puppet that should be running"
  echo ""
  exit 1
}

# Get a flat representation of yaml without relying on external tools.
parse_yaml () {
   local prefix=$2
   local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
   sed -ne "s|^\($s\):|\1|" \
        -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
        -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p"  $1 |
   awk -F$fs '{
      indent = length($1)/2;
      vname[indent] = $2;
      for (i in vname) {if (i > indent) {delete vname[i]}}
      if (length($3) > 0) {
         vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
         printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
      }
   }'
}

# Get first error from last_run_report.yaml
get_first_error() {
  grep_cmd="/bin/grep -B 3 -A 1"
  first_error_time=$($grep_cmd "status: failure" $lastrunreport | grep "time: " | sort -n | head -1)
  first_error=$($grep_cmd "$first_error_time" $lastrunreport | grep "message: " | sed 's/.*message: //' | head -1)
  echo "FIRST_ERROR ($first_error)"
}

# SCRIPT
# Fix home directory if needed
# On Gentoo otherwise we get
# UNKNOWN: Internal error: Puppet version unknown from Error: Could not initialize global default settings: Permission denied @ dir_s_mkdir - /root/.puppetlabs
# this happens because $HOME is not set to the user one
export HOME=$(eval echo "~$(whoami)")
#
while getopts "c:d:l:s:r:w:v:PEh" opt; do
  case $opt in
    c)
      if ! echo $OPTARG | grep -q "[A-Za-z]" && [ -n "$OPTARG" ]
      then
        CRIT=$OPTARG
      else
        usage
      fi
    ;;
    d)
      # argument should be 0 or 1
      if [ $OPTARG -eq 0 -o $OPTARG -eq 1 ];then
        daemonized=$OPTARG
      else
        usage
      fi
    ;;
    h) usage ;;
    l) agent_disabled_lockfile=$OPTARG ;;
    s) lastrunfile=$OPTARG ;;
    r) lastrunreport=$OPTARG ;;
    w)
      if ! echo $OPTARG | grep -q "[A-Za-z]" && [ -n "$OPTARG" ]
      then
        WARN=$OPTARG
      else
        usage
      fi
    ;;
    P)
      PERF=true
    ;;
    E)
      SHOW_ERROR=true
    ;;
    v)
      wanted_version=$OPTARG
    ;;
    *)
      usage
    ;;
  esac
done

parse_puppet_config () {
  echo "$puppet_config_output" | while read key value; do
    if [ "$key" = "$1" ]; then
        echo "${value#= }"
    fi
  done
}

[ -z "$HOME" ] && export HOME=$(getent passwd `whoami` | cut -d: -f6)  # Some clean environment situations make puppet -V fail.

# Ensure installation directory of puppet 4 is included in PATH
PATH="$PATH:/opt/puppetlabs/bin"
# Find location of puppet executable.
PUPPET=$(which puppet) || result 8
# Check if sudo installed
which sudo 1>/dev/null 2>&1 || result 14

# Find out Puppet major version to determine configprint syntax.
puppet_major_version=$($PUPPET -V|cut -d. -f1)

[ -z "$puppet_major_version" ] && result 9 "Puppet version unknown from $($PUPPET -V 2>&1)"

# Set Puppet configprint syntax.
case $puppet_major_version in
  2)
    puppet_config_print="sudo $PUPPET config print all"
    ;;
  3)
    puppet_config_print="sudo $PUPPET config print"
    ;;
  *)
    puppet_config_print="sudo $PUPPET config print --section agent"
    ;;
esac

puppet_config_output="$($puppet_config_print 2> /dev/null)"
# construct WARN and CRIT times based on runinterval plus a safety buffer
# if they have not already been explicitly set
runinterval=$(parse_puppet_config "runinterval")
splaylimit=0
splay=$(parse_puppet_config "splay")
[ "$splay" != "false" ] && splaylimit=$(parse_puppet_config "splaylimit")
[ -z "$WARN" ] && WARN=$(($runinterval + $splaylimit))
[ -z "$CRIT" ] && CRIT=$(($WARN + $runinterval))
#now check we finally have some sensible settings
[ -z "$WARN" -o $WARN -lt 30 ] && result 5
[ -z "$CRIT" -o $CRIT -lt 60 ] && result 5

# If the disabled lockfile is not given as a param try to find it ourselves.
[ -z "$agent_disabled_lockfile" ] && agent_disabled_lockfile=$(parse_puppet_config "agent_disabled_lockfile")
# If there's a disabled.lock file don't look any further.
[ -f "$agent_disabled_lockfile" ] && result 7

# If the lastrunfile is not given as a param try to find it ourselves.
[ -z "$lastrunfile" ] && lastrunfile=$(parse_puppet_config "lastrunfile")
# Check if state file exists.
[ -s $lastrunfile -a -r $lastrunfile ] || result 1

# If the lastrunreport is not given as a param try to find it ourselves.
[ -z "$lastrunreport" ] && lastrunreport=$(parse_puppet_config "lastrunreport")
# Check if the lastrunreport is readable
[ -r "$lastrunreport" ] || result 12
# Check if state file exists.
[ -n "$SHOW_ERROR" ] && ( [ -s $lastrunreport -a -r $lastrunreport ] || result 12 )

# Check if daemonized was set, else set default to 1.
[ -n "$daemonized" ] || daemonized=1
# If Puppet agent runs as a daemon there should be a process. We can't check so much when it is triggered by cron.
if [ $daemonized -eq 1 ];then
  # Puppet version 4 changed several paths, determine correct ones
  if [ $puppet_major_version -ge 4 ];then
    puppet_daemon_rundir="puppetlabs"
    puppet_daemon_regex="/opt/puppetlabs/puppet/bin/ruby /opt/puppetlabs/puppet/bin/puppet"
  else
    puppet_daemon_rundir="puppet"
    puppet_daemon_regex="/usr(/local)?/bin/ruby[^ ]* /usr(/local)?/s?bin/puppetd?"
  fi

  # Check puppet daemon:
  [ "$(ps axfww|egrep "$puppet_daemon_regex"|grep -v egrep)" ] || result 4

  uname -a|grep -q BSD && default_pidfile=/var/$puppet_daemon_rundir/run/agent.pid || default_pidfile=/var/run/$puppet_daemon_rundir/agent.pid
  [ -e $default_pidfile ] && pidfile=$default_pidfile || pidfile=$(parse_puppet_config "pidfile")

  # If there is a pidfile tell me the pid, else fail.
  [ -f $pidfile ]&&pid=$(cat $pidfile)||result 4

  # See if the process is running.
  ps -p $pid > /dev/null || result 4

  # On Linux test if the pid we found in the pidfile is puppet:
  if uname -a|grep -q Linux;then
    grep -q puppet /proc/$pid/cmdline ||result 4
  fi
fi

# parse last run file
# puppet version 4 files have less intendation, add prefix to match parsed variables from older versions
[ $puppet_major_version -ge 4 ] && yaml_prefix="_"
eval $(parse_yaml $lastrunfile $yaml_prefix)
# this flattens the hierarchy to single-level name/value variables, eg:
# _events_total="14"
# _version_config="1448907293"

# Construct perf data using anything that starts with "_resources_ or _time_total"
if [ -n "$PERF" ] ; then
  for V in $(set | grep "^_resources_\|^_time_total") ; do
   PERF_DATA="$(echo $V | sed 's/^_//' | sed "s/='/=/" | sed "s/'$//") $PERF_DATA"
  done
  PERF_DATA="| $PERF_DATA"
fi

# Construct FIRST_ERROR using last_run_report.yaml
if [ -n "$SHOW_ERROR" ] ; then
  FIRST_ERROR=$(get_first_error)
fi

# If the last run failed to retrieve the catalog from the server
grep -q 'Could not retrieve catalog from remote server' $lastrunreport && result 13

# Check when last run happened.
last_run=$_time_last_run
last_run_human=$(date -d @$last_run +%c)
now=$(date +%s)

# Check how long system been up in seconds
uptime=$(cut -f1 -d' ' /proc/uptime | cut -f1 -d.)

# Assess last run time relative to warn/crit values and system uptime.
time_since_last=$((now-last_run))
[ $time_since_last -ge $CRIT -a $uptime -ge $CRIT ] && result 3
[ $time_since_last -ge $CRIT -a $uptime -lt $CRIT ] && result 10
[ $time_since_last -ge $WARN -a $uptime -ge $WARN ] && result 2
[ $time_since_last -ge $WARN -a $uptime -lt $WARN ] && result 10

# Get some more info from the yaml file.
config=$_version_config
version=$_version_puppet
failed=$_resources_failed
failure=$_events_failure
failed_to_restart=$_resources_failed_to_restart

# If any of the values above doesn't return raise an error.
[ -z "$last_run" -o -z "$config" -o -z "$version" -o -z "$failed" -o -z "$failure" -o -z "$failed_to_restart" ] && result 1
# If anything went wrong last run => crit.
[ $failed -gt 0 -o  $failure -gt 0 -o $failed_to_restart -gt 0 ] && result 6

# If $wanted_version is set, compare it to the running version
if [ -n "$wanted_version" -a -n "$version" ]; then
  [ "$wanted_version" != "$version" ] && result 11
fi

# If we reached here all is ok.
result 0

# END
